In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import glob
import os
import scipy as sp
from scipy import stats

from tools.plt import color2d #from the 'srcole/tools' repo
from matplotlib import cm
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 100

Load dataframes


In [2]:
# Load cities info
df_cities = pd.read_csv('/gh/data2/yelp/city_pop.csv', index_col=0)
df_cities.head()


Out[2]:
city state population total_food latitude longitude total_scraped
0 New York New York 8537673 54191 40.705445 -73.994293 1000
1 Los Angeles California 3976322 41685 34.061590 -118.321381 1000
2 Chicago Illinois 2704958 19315 41.905159 -87.677765 1000
3 Houston Texas 2303482 15197 29.784854 -95.359955 1000
4 Phoenix Arizona 1615017 11034 33.465086 -112.070160 1000

In [3]:
# Load restaurants
df_restaurants = pd.read_csv('/gh/data2/yelp/food_by_city/df_restaurants.csv', index_col=0)
df_restaurants.head()


Out[3]:
id name city state rating review_count cost latitude longitude has_delivery has_pickup url
0 poquito-picante-brooklyn-2 Poquito Picante New York New York 4.5 40 2 40.685742 -73.981262 True True https://www.yelp.com/biz/poquito-picante-brook...
1 nourish-brooklyn-4 Nourish New York New York 4.0 65 2 40.677960 -73.968550 True True https://www.yelp.com/biz/nourish-brooklyn-4?ad...
2 taste-of-heaven-brooklyn Taste of Heaven New York New York 5.0 19 2 40.717150 -73.940540 False True https://www.yelp.com/biz/taste-of-heaven-brook...
3 milk-and-cream-cereal-bar-new-york Milk & Cream Cereal Bar New York New York 4.5 307 2 40.719580 -73.996540 False False https://www.yelp.com/biz/milk-and-cream-cereal...
4 the-bao-shoppe-new-york-2 The Bao Shoppe New York New York 4.0 99 1 40.714345 -73.990518 False False https://www.yelp.com/biz/the-bao-shoppe-new-yo...

In [4]:
# Load categories by restaurant
df_categories = pd.read_pickle('/gh/data2/yelp/food_by_city/df_categories_sparse.pkl')
df_categories.head()


Out[4]:
acaibowls accessories active acupuncture adultedu advertising aerialfitness afghani african airport_shuttles airportlounges amateursportsteams amusementparks animalshelters antiques apartments appliances aquariums arabian arcades archery argentine armenian artclasses artmuseums arts artsandcrafts artschools artsupplies arttours asianfusion attractionfarms auctionhouses australian austrian auto auto_detailing autocustomization autopartssupplies autorepair ayurveda baby_gear bagels bakeries bangladeshi banks barbers barcrawl bars bartenders ... truckrepair turkish tuscan ukrainian university_housing unofficialyelpevents usedbooks uzbek vacation_rentals vapeshops vegan vegetarian venezuelan venues vermouthbars vet videoandgames videofilmproductions videogamestores vietnamese vintage vinyl_records virtualrealitycenters visitorcenters vitaminssupplements waffles walkingtours watches waterdelivery waterparks waterpurification waterstores waxing web_design wedding_planning weddingchappels weightlosscenters whiskeybars wholesale_stores wholesalers wine_bars wineries winetasteclasses winetastingroom winetours womenscloth wraps yelpevents yoga zoos
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

5 rows × 684 columns


In [5]:
# These are used for the 'category' input to the search function
df_categories_info = pd.read_json('/gh/data2/yelp/categories.json')
df_categories_info.head()


Out[5]:
alias country_blacklist country_whitelist parents title
0 3dprinting NaN NaN [localservices] 3D Printing
1 abruzzese NaN [IT] [italian] Abruzzese
2 absinthebars NaN [CZ] [bars] Absinthe Bars
3 acaibowls [AR, PL, TR, MX, CL, IT] NaN [food] Acai Bowls
4 accessories NaN NaN [fashion] Accessories

Cuisines by city


In [6]:
# New dataframe: For each cuisine, compute the average rating, average price, and # restaurants
all_cuisines = df_categories.keys()
cuisine_dict = {'cuisine': [],
                'avg_rating': [],
                'avg_cost': [],
                'N': []}
for k in all_cuisines:
    df_temp = df_restaurants[df_categories[k]==1]
    cuisine_dict['cuisine'].append(k)
    cuisine_dict['avg_rating'].append(df_temp['rating'].mean())
    cuisine_dict['avg_cost'].append(df_temp['cost'].mean())
    cuisine_dict['N'].append(len(df_temp))
df_cuisine = pd.DataFrame.from_dict(cuisine_dict)

# Determine cuisines of interest
# Only look at cuisines with at least 2000 restaurants
min_N = 1000
category_counts = df_categories.sum()
categories_keep = category_counts[category_counts > min_N]
cuisines_rmv = ['bars', 'beer_and_wine', 'beerbar', 'breweries', 'butcher', 'cafes', 'catering',
                'chickenshop', 'cocktailbars', 'convenience', 'cosmetics', 'customcakes',
                'deptstores', 'divebars', 'drugstores', 'eventplanning', 'farmersmarket', 'fooddeliveryservices',
                'foodstands', 'gastropubs', 'gourmet', 'grocery', 'healthmarkets', 'importedfood', 'intlgrocery',
                'karaoke', 'lounges', 'markets', 'meats', 'musicvenues', 'personalchefs', 'pubs',
                'restaurants', 'salvadoran', 'seafoodmarkets', 'servicestations', 'sportsbars', 'streetvendors',
                'tapasmallplates', 'venues', 'wine_bars', 'wineries']
categories_keep.drop(cuisines_rmv, inplace=True)
categories_keep = categories_keep.keys()

In [9]:
df_categories.loc[:,categories_keep]


-----------------------------------------------------------------------
KeyboardInterrupt                     Traceback (most recent call last)
<ipython-input-9-2139667b596b> in <module>()
----> 1 df_categories.loc[:,categories_keep]

~/anaconda/lib/python3.6/site-packages/IPython/core/displayhook.py in __call__(self, result)
    255             self.start_displayhook()
    256             self.write_output_prompt()
--> 257             format_dict, md_dict = self.compute_format_data(result)
    258             self.update_user_ns(result)
    259             self.fill_exec_result(result)

~/anaconda/lib/python3.6/site-packages/IPython/core/displayhook.py in compute_format_data(self, result)
    149 
    150         """
--> 151         return self.shell.display_formatter.format(result)
    152 
    153     # This can be set to True by the write_output_prompt method in a subclass

~/anaconda/lib/python3.6/site-packages/IPython/core/formatters.py in format(self, obj, include, exclude)
    178             md = None
    179             try:
--> 180                 data = formatter(obj)
    181             except:
    182                 # FIXME: log the exception

<decorator-gen-10> in __call__(self, obj)

~/anaconda/lib/python3.6/site-packages/IPython/core/formatters.py in catch_format_error(method, self, *args, **kwargs)
    222     """show traceback on failed format call"""
    223     try:
--> 224         r = method(self, *args, **kwargs)
    225     except NotImplementedError:
    226         # don't warn on NotImplementedErrors

~/anaconda/lib/python3.6/site-packages/IPython/core/formatters.py in __call__(self, obj)
    700                 type_pprinters=self.type_printers,
    701                 deferred_pprinters=self.deferred_printers)
--> 702             printer.pretty(obj)
    703             printer.flush()
    704             return stream.getvalue()

~/anaconda/lib/python3.6/site-packages/IPython/lib/pretty.py in pretty(self, obj)
    393                             if callable(meth):
    394                                 return meth(obj, self, cycle)
--> 395             return _default_pprint(obj, self, cycle)
    396         finally:
    397             self.end_group()

~/anaconda/lib/python3.6/site-packages/IPython/lib/pretty.py in _default_pprint(obj, p, cycle)
    508     if _safe_getattr(klass, '__repr__', None) is not object.__repr__:
    509         # A user-provided repr. Find newlines and replace them with p.break_()
--> 510         _repr_pprint(obj, p, cycle)
    511         return
    512     p.begin_group(1, '<')

~/anaconda/lib/python3.6/site-packages/IPython/lib/pretty.py in _repr_pprint(obj, p, cycle)
    699     """A pprint that just redirects to the normal repr function."""
    700     # Find newlines and replace them with p.break_()
--> 701     output = repr(obj)
    702     for idx,output_line in enumerate(output.splitlines()):
    703         if idx:

~/anaconda/lib/python3.6/site-packages/pandas/core/base.py in __repr__(self)
     78         Yields Bytestring in Py2, Unicode String in py3.
     79         """
---> 80         return str(self)
     81 
     82 

~/anaconda/lib/python3.6/site-packages/pandas/core/base.py in __str__(self)
     57 
     58         if compat.PY3:
---> 59             return self.__unicode__()
     60         return self.__bytes__()
     61 

~/anaconda/lib/python3.6/site-packages/pandas/core/frame.py in __unicode__(self)
    634             width = None
    635         self.to_string(buf=buf, max_rows=max_rows, max_cols=max_cols,
--> 636                        line_width=width, show_dimensions=show_dimensions)
    637 
    638         return buf.getvalue()

~/anaconda/lib/python3.6/site-packages/pandas/core/frame.py in to_string(self, buf, columns, col_space, header, index, na_rep, formatters, float_format, sparsify, index_names, justify, line_width, max_rows, max_cols, show_dimensions)
   1672                                            max_rows=max_rows,
   1673                                            max_cols=max_cols,
-> 1674                                            show_dimensions=show_dimensions)
   1675         formatter.to_string()
   1676 

~/anaconda/lib/python3.6/site-packages/pandas/io/formats/format.py in __init__(self, frame, buf, columns, col_space, header, index, na_rep, formatters, justify, float_format, sparsify, index_names, line_width, max_rows, max_cols, show_dimensions, decimal, **kwds)
    416             self.columns = frame.columns
    417 
--> 418         self._chk_truncate()
    419         self.adj = _get_adjustment()
    420 

~/anaconda/lib/python3.6/site-packages/pandas/io/formats/format.py in _chk_truncate(self)
    487             else:
    488                 row_num = max_rows_adj // 2
--> 489                 frame = concat((frame.iloc[:row_num, :],
    490                                 frame.iloc[-row_num:, :]))
    491             self.tr_row_num = row_num

~/anaconda/lib/python3.6/site-packages/pandas/core/indexing.py in __getitem__(self, key)
   1365             except (KeyError, IndexError):
   1366                 pass
-> 1367             return self._getitem_tuple(key)
   1368         else:
   1369             # we by definition only have the 0th axis

~/anaconda/lib/python3.6/site-packages/pandas/core/indexing.py in _getitem_tuple(self, tup)
   1751                 continue
   1752 
-> 1753             retval = getattr(retval, self.name)._getitem_axis(key, axis=axis)
   1754 
   1755             # if the dim was reduced, then pass a lower-dim the next time

~/anaconda/lib/python3.6/site-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis)
   1803         if isinstance(key, slice):
   1804             self._has_valid_type(key, axis)
-> 1805             return self._get_slice_axis(key, axis=axis)
   1806 
   1807         if isinstance(key, list):

~/anaconda/lib/python3.6/site-packages/pandas/core/indexing.py in _get_slice_axis(self, slice_obj, axis)
   1772         slice_obj = self._convert_slice_indexer(slice_obj, axis)
   1773         if isinstance(slice_obj, slice):
-> 1774             return self._slice(slice_obj, axis=axis, kind='iloc')
   1775         else:
   1776             return self.obj._take(slice_obj, axis=axis, convert=False)

~/anaconda/lib/python3.6/site-packages/pandas/core/indexing.py in _slice(self, obj, axis, kind)
    153         if axis is None:
    154             axis = self.axis
--> 155         return self.obj._slice(obj, axis=axis, kind=kind)
    156 
    157     def _get_setitem_indexer(self, key):

~/anaconda/lib/python3.6/site-packages/pandas/core/sparse/frame.py in _slice(self, slobj, axis, kind)
    511             new_columns = self.columns[slobj]
    512 
--> 513         return self.reindex(index=new_index, columns=new_columns)
    514 
    515     def xs(self, key, axis=0, copy=False):

~/anaconda/lib/python3.6/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
    125         @wraps(func)
    126         def wrapper(*args, **kwargs):
--> 127             return func(*args, **kwargs)
    128 
    129         if not PY2:

~/anaconda/lib/python3.6/site-packages/pandas/core/frame.py in reindex(self, *args, **kwargs)
   2933         kwargs.pop('axis', None)
   2934         kwargs.pop('labels', None)
-> 2935         return super(DataFrame, self).reindex(**kwargs)
   2936 
   2937     @Appender(_shared_docs['reindex_axis'] % _shared_doc_kwargs)

~/anaconda/lib/python3.6/site-packages/pandas/core/generic.py in reindex(self, *args, **kwargs)
   3021         # perform the reindex on the axes
   3022         return self._reindex_axes(axes, level, limit, tolerance, method,
-> 3023                                   fill_value, copy).__finalize__(self)
   3024 
   3025     def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,

~/anaconda/lib/python3.6/site-packages/pandas/core/frame.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
   2868         if index is not None:
   2869             frame = frame._reindex_index(index, method, copy, level,
-> 2870                                          fill_value, limit, tolerance)
   2871 
   2872         return frame

~/anaconda/lib/python3.6/site-packages/pandas/core/sparse/frame.py in _reindex_index(self, index, method, copy, level, fill_value, limit, takeable)
    669             values = series.values
    670             # .take returns SparseArray
--> 671             new = values.take(indexer)
    672             if need_mask:
    673                 new = new.values

~/anaconda/lib/python3.6/site-packages/pandas/core/sparse/array.py in take(self, indices, axis, allow_fill, fill_value, **kwargs)
    509             indices[indices < 0] += n
    510 
--> 511         locs = self.sp_index.lookup_array(indices)
    512         indexer = np.arange(len(locs), dtype=np.int32)
    513         mask = locs != -1

KeyboardInterrupt: 

IN THIS NOTEBOOK AND OTHERS, NEED TO FIX HOW USE DF_CAT IN BELOW DF NOW THAT ITS SPARSE


In [ ]:
# Set up dataframe for restaurants with categories of interest
restaurant_have_category = df_categories.loc[:,categories_keep].sum(axis=1).to_dict()
df_restaurants_keep_idx = [k for k in restaurant_have_category.keys() if restaurant_have_category[k]]
df_restaurants_temp = df_restaurants.loc[df_restaurants_keep_idx].reset_index(drop=True)
df_categories_temp = df_categories.loc[df_restaurants_keep_idx,categories_keep].reset_index(drop=True)
df_restaurants_temp = df_restaurants_temp.merge(df_categories_temp, left_index=True, right_index=True)

In [ ]:
# Compute fraction of each cuisine by city
df_city_cuisines = df_restaurants_temp.groupby('city').mean()
df_state_cuisines = df_restaurants_temp.groupby('state').mean()

Explore features by city

  • rating, review_count, cost, has_delivery, has_pickup
  • each cuisine

In [ ]:
df_city_cuisines.head(5)

Highest average rating

  • Highest average rating are the most popular cities because yelp will return the top ones in each city

In [ ]:
df_city_cuisines.sort_values('rating', ascending=False, inplace=True)

N=60
plt.figure(figsize=(30,5))
plt.bar(np.arange(N), df_city_cuisines['rating'].values[:N], color='k', ecolor='.5')
plt.xticks(np.arange(N), df_city_cuisines.index[:N])
plt.ylabel('Average rating', size=20)
plt.xlabel('City', size=20)
plt.xticks(size=15, rotation='vertical')
# plt.yticks([10**3, 10**4, 10**5], size=15)
plt.ylim((3.5, 4.5))
plt.xlim((-1, N))

boba


In [ ]:
c = 'bubbletea'
df_city_cuisines.sort_values(c, ascending=False, inplace=True)

N=60
plt.figure(figsize=(30,5))
plt.bar(np.arange(N), df_city_cuisines[c].values[:N], color='k', ecolor='.5')
plt.xticks(np.arange(N), df_city_cuisines.index[:N])
plt.ylabel('Fraction of restaurants are\n'+c, size=20)
plt.xlabel('City', size=20)
plt.xticks(size=15, rotation='vertical')
plt.xlim((-1, N))

In [ ]:
c = 'mexican'
df_state_cuisines.sort_values(c, ascending=False, inplace=True)

plt.figure(figsize=(30,5))
plt.bar(np.arange(len(df_state_cuisines)), df_state_cuisines[c].values, color='k', ecolor='.5')
plt.xticks(np.arange(len(df_state_cuisines)), df_state_cuisines.index)
plt.ylabel('Fraction of restaurants are\n'+c, size=20)
plt.xlabel('State', size=20)
plt.xticks(size=15, rotation='vertical')
plt.xlim((-1, len(df_state_cuisines)))

In [ ]:
c = 'italian'
df_state_cuisines.sort_values(c, ascending=False, inplace=True)

plt.figure(figsize=(30,5))
plt.bar(np.arange(len(df_state_cuisines)), df_state_cuisines[c].values, color='k', ecolor='.5')
plt.xticks(np.arange(len(df_state_cuisines)), df_state_cuisines.index)
plt.ylabel('Fraction of restaurants are\n'+c, size=20)
plt.xlabel('State', size=20)
plt.xticks(size=15, rotation='vertical')
plt.xlim((-1, len(df_state_cuisines)))